/* bmw_small-tinyasm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2009  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/*
 * File:        bmw_small-tinyasm.S
 * Author:      Daniel Otte
 * Date:        2010-03-28
 * License:     GPLv3 or later
 * Description: implementation of BlueMidnightWish
 *
 */

#include "avr-asm-macros.S"

/* Register numbers of the 32-bit accumulator: acc0 is the least significant
   byte, acc3 the most significant. acc0/acc1 and acc2/acc3 are adjacent
   registers, so each half can be copied with one movw. */
acc2 =  8
acc3 =  9
acc0 = 14
acc1 = 15

/* A non-zero value assembles the debug print helpers at the end of this file. */
#define DEBUG 0

/******************************************************************************/
/*
  shiftleft32 - logical shift of a 32-bit value by a signed amount

  param a: r22:r23:r24:r25  (r22 = least significant byte), shifted in place
  param s: r20              signed shift count; a negative count shifts
                            right by -s (handled by shiftright32)

  Whole bytes are moved while s >= 8; the remaining 0..7 single-bit shifts
  are done by the shared loop at bitrotateleft_1 with r0 = 0, so zeros are
  shifted in. Clobbers r0 and r20.
*/
shiftleft32:
	tst r20
	brpl 10f              ; s >= 0: shift left
	neg r20               ; s < 0: shift right by -s instead
	rjmp shiftright32
10:
	clr r0                ; r0 supplies the bit shifted in by the bit loop
	cpi r20, 8
	brlo bitrotateleft_1  ; fewer than 8 bits left: finish in the bit loop
	mov r25, r24          ; shift left by one whole byte
	mov r24, r23
	mov r23, r22
	clr r22
	subi r20, 8
	rjmp 10b

/******************************************************************************/
/*
  shiftright32 - logical right shift of a 32-bit value

  param a: r22:r23:r24:r25  (r22 = least significant byte), shifted in place
  param s: r20              shift count, compared as an unsigned value

  Whole bytes are moved while s >= 8, then the remaining bits are shifted
  one at a time. r20 is 0 on return.
*/
shiftright32:
	cpi r20, 8
	brlo bitshiftright
	mov r22, r23          ; shift right by one whole byte
	mov r23, r24
	mov r24, r25
	clr r25
	subi r20, 8
	rjmp shiftright32
bitshiftright:
	tst r20
	breq 20f              ; nothing left to shift
10:	lsr r25               ; zero enters at the top, carry ripples down to r22
	ror r24
	ror r23
	ror r22
	dec r20
	brne 10b
20: ret

/******************************************************************************/
/*
  rotateleft32 - rotate a 32-bit value left

  param a: r22:r23:r24:r25  (r22 = least significant byte), rotated in place
  param s: r20              rotation count, compared as an unsigned value

  Whole bytes are rotated while s >= 8. For the remaining bits r0 holds a
  copy of the top byte; its top bit is moved into carry by lsl and then
  enters r22 through the rol chain.

  Additional entry points used by other routines in this file:
    bitrotateleft_1  bit loop with r0 prepared by the caller
                     (shiftleft32 enters with r0 = 0 to shift in zeros)
    rol32            starts at the rol chain, taking the incoming carry
                     from the caller; loops r20 times like the bit loop
  Clobbers r0; r20 is 0 on return.
*/
rotateleft32:
	cpi r20, 8
	brlo bitrotateleft
	mov r0, r25           ; rotate left by one whole byte
	mov r25, r24
	mov r24, r23
	mov r23, r22
	mov r22, r0
	subi r20, 8
	rjmp rotateleft32
bitrotateleft:
    mov r0, r25           ; r0 = copy of the top byte, source of wrapped bits
bitrotateleft_1:
	tst r20
	breq 20f
10:
	lsl r0                ; carry = next bit to enter at bit 0
rol32:
	rol r22
	rol r23
	rol r24
	rol r25
	dec r20
	brne 10b
20: ret


/******************************************************************************/

/*
  sn_stub - helper of sn: acc ^= rotl(x, n)

  x is taken from r2:r3:r4:r5, the rotation count n is read from program
  memory at Z (Z is post-incremented). Clobbers r0, r20, r22..r25.

  eor32_to_acc - acc ^= r22:r23:r24:r25
*/
sn_stub:
	movw r22, r2
	movw r24, r4
	lpm r20, Z+
	rcall rotateleft32
eor32_to_acc:
	eor acc0, r22
	eor acc1, r23
	eor acc2, r24
	eor acc3, r25
	ret

/* Parameters of the functions s0..s5 as consumed by sn, four bytes per row:
   right shift count, left shift count, first rotation count, second
   rotation count. For s4 and s5 the last three entries are 0, so those
   three terms are x itself and combine to a single x under xor. */
s_table:
s0:  .byte 1, 3, 4,19
s1:  .byte 1, 2, 8,23
s2:  .byte 2, 1,12,25
s3:  .byte 2, 2,15,29
s4:  .byte 1, 0, 0, 0
s5:  .byte 2, 0, 0, 0

/* Register aliases; h0/h1/m0/m1 are re-assigned further down before the
   next routine that references them. */
h0   = 10
h1   = 11
m0   = 12
m1   = 13

/*
  sn - evaluate one of the functions s0..s5:
       result = (x >> t[0]) ^ (x << t[1]) ^ rotl(x, t[2]) ^ rotl(x, t[3])
       where t is row s of s_table

  param x: r22:r23:r24:r25  (r22 = least significant byte)
  param s: r20              row index into s_table (0..5)
  return:  r22:r23:r24:r25

  r2..r5 and the accumulator registers are saved on the stack and restored.
  Clobbers r0, r20 and Z. Relies on r1 being zero.
  push_range / pop_range are macros from avr-asm-macros.S (not shown here).
*/
sn:
	push_range 2, 5
	push acc0
	push acc1
	push acc2
	push acc3
	ldi r30, lo8(s_table)
	ldi r31, hi8(s_table)
	lsl r20               ; Z = s_table + 4*s
	lsl r20
	add r30, r20
	adc r31, r1
	movw r2, r22          ; keep x in r2..r5 for the following terms
	movw r4, r24
	lpm r20, Z+
	rcall shiftright32
	rcall mov32_to_acc    ; acc = x >> t[0]
;---
	movw r22, r2
	movw r24, r4
	lpm r20, Z+
	rcall shiftleft32
	rcall eor32_to_acc    ; acc ^= x << t[1]
;---
	rcall sn_stub         ; acc ^= rotl(x, t[2])
	rcall sn_stub         ; acc ^= rotl(x, t[3])

	movw r22, acc0        ; move the result to the return registers
	movw r24, acc2
	pop acc3
	pop acc2
	pop acc1
	pop acc0
	rjmp pop5             ; shared tail: restores r2..r5 and returns

/******************************************************************************/
/*
  memxor    - xor len bytes from src into dest
  memxor_64 - same with len fixed to 64 (any value passed in r20 is ignored)

  param dest: r26:r27 (X)
  param src:  r30:r31 (Z)
  param len:  r20       (memxor only; the counter is decremented before it
                         is tested, so 0 is processed as 256 bytes)

  X and Z are advanced past the processed bytes. Clobbers r20, r21, r22.
*/
memxor_64:
;	tst r20
;	breq memxor_exit
	ldi r20, 64
memxor:
10: ld r21, X
	ld r22, Z+
	eor r21, r22
	st X+, r21
	dec r20
	brne 10b
memxor_exit:
	ret

/******************************************************************************/
/* Register aliases for the q, h and m pointers. h0/h1/m0/m1 are re-assigned
   again before the next routine that references them; the same q/h/m
   values are assigned once more ahead of bmw_small_nextBlock. */
q0 = 2
q1 = 3
h0 = 4
h1 = 5
m0 = 6
m1 = 7


/******************************************************************************/
/* Load the 32-bit word at X into r22 (lowest address) .. r25;
   X is advanced by 4. */
load32_from_X:
	.irp dst, r22, r23, r24, r25
	ld \dst, X+
	.endr
	ret

/* Load the 32-bit word at Y into r22 (lowest address) .. r25;
   Y is advanced by 4. */
load32_from_Y:
	.irp dst, r22, r23, r24, r25
	ld \dst, Y+
	.endr
	ret

/* Store r22 (to the lowest address) .. r25 as a 32-bit word at Y;
   Y is advanced by 4. */
store32_to_Y:
	.irp src, r22, r23, r24, r25
	st Y+, \src
	.endr
	ret

/* r22:r23:r24:r25 += 32-bit word at X (r22 and the lowest address hold the
   least significant byte). X is advanced by 4; clobbers r0. */
add_X_to_32:
	ld r0, X+
	add r22, r0
	ld r0, X+
	adc r23, r0           ; ld does not touch the carry between the steps
	ld r0, X+
	adc r24, r0
	ld r0, X+
	adc r25, r0
	ret

/* Store r22 (to the lowest address) .. r25 as a 32-bit word at X;
   X is advanced by 4. */
store32_to_X:
	.irp src, r22, r23, r24, r25
	st X+, \src
	.endr
	ret

/* acc = r22:r23:r24:r25 (two register-pair moves). */
mov32_to_acc:
	movw acc0, r22
	movw acc2, r24
	ret

/******************************************************************************/
/*
  param q:  r28:r29 (Y)
  param h:  r26:r27 (X)
  param m:  r30:r31 (Z)
*/

/* Signed shift counts for the first loop of f2, read two per output word:
   first the count applied to XH, then the count applied to q[16+i].
   Positive counts shift left, negative counts shift right (see
   shiftleft32). That loop reads two bytes on each of its 16 iterations but
   only applies them on the first 8, so it also reads (and discards) the
   16 bytes that follow this table. */
f2_1_shift_table:
;	.byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55
;	.byte 0x55, 0x87, 0x55, 0x51, 0x03, 0x66, 0x64, 0x2B
	.byte 5, -5, -7, 8, -5, 5, -1, 5, -3, 0, 6, -6, -4, 6, -11, 2
/* Signed shift counts applied to XL in the second loop of f2. That loop
   starts reading 8 bytes before this table and discards those bytes. */
f2_2_shift_table:
;	.byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1)
	.byte 8, -6, 6, 4, -3, -4, -7, -2
/* Rotation counts used by expand2 for every second input word. */
expand2_rot_table:
	.byte 3,7,13,16,19,23,27

/* One row per pass of the W computation in f0 (the last row is used first):
   byte 0 and byte 1 form a 16-bit mask, byte 0 being the high half; the
   mask is shifted left once per word and a 1 shifted out selects
   subtraction instead of addition. Byte 2 is the byte offset into h of the
   first word used in that pass. */
f0_hacktable:
	.byte 0x03, 0x11, 5*4
	.byte 0xDD, 0xB3, 7*4
	.byte 0x2A, 0x79, 10*4
	.byte 0x07, 0xAA, 13*4
	.byte 0x51, 0xC2, 14*4


/*******************************************************************************
* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){
* 	uint32_t r;
*	r  = pgm_read_dword(k_lut+j);
*	r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
*	r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
*	r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
*	r ^= ((uint32_t*)h)[(j+7)&0xf];
*	return r;
* }
* param j: r24
* param m: r22:r23
* param h: r20:r21
*/
/* Register aliases used by addelement and its helpers:
   j = round index, h0:h1 = pointer to h, m0:m1 = pointer to m,
   acc0..acc3 = 32-bit accumulator (acc0 least significant). */
j    = 16
acc2 =  8
acc3 =  9
h0   = 10
h1   = 11
m0   = 12
m1   = 13
acc0 = 14
acc1 = 15

/* Load the 32-bit word at X into the accumulator (acc0 from the lowest
   address); X is advanced by 4. */
load_acc_from_X:
	.irp dst, acc0, acc1, acc2, acc3
	ld \dst, X+
	.endr
	ret

/* 32-bit word at X += acc (read-modify-write in memory).
   X is advanced by 4; clobbers r0. */
add_acc_to_X:
	ld r0, X
	add r0, acc0
	st X+, r0             ; ld/st leave the carry untouched for the next adc
	ld r0, X
	adc r0, acc1
	st X+, r0
	ld r0, X
	adc r0, acc2
	st X+, r0
	ld r0, X
	adc r0, acc3
	st X+, r0
	ret

/*
  load_rotate_add_M - add or subtract one rotated message word

  With i = j & 15:  t = rotl(m[i], i + 1)
  T flag clear: acc += t      T flag set: acc -= t

  in:  j, m0:m1 (pointer to m), acc, T flag
  Clobbers r0, r20, r22..r25 and X. Relies on r1 being zero.
*/
load_rotate_add_M:
	mov r20, j
	andi r20, 0x0f        ; i = j mod 16
	mov r0, r20
	lsl r0                ; byte offset = 4*i
	lsl r0
	movw r26, m0
	add r26, r0
	adc r27, r1
	rcall load32_from_X
	inc r20               ; rotation count = i + 1
	rcall rotateleft32
	brts 10f              ; T set: subtract instead of add
	rjmp add32_to_acc
;	ret
10:	sub acc0, r22
	sbc acc1, r23
	sbc acc2, r24
	sbc acc3, r25
	ret


;---

/******************************************************************************/
/*
  load_sn_add  - acc += s_n(word at X), n = r20; X is advanced by 4
  add32_to_acc - acc += r22:r23:r24:r25
*/
load_sn_add:
	rcall load32_from_X
	rcall sn
add32_to_acc:
	add acc0, r22
	adc acc1, r23
	adc acc2, r24
	adc acc3, r25
	ret

/*
  expand_intro / addelement - common first part of expand1 and expand2

  param q: r26:r27 (X)  start of the q array
  param m: r22:r23
  param h: r20:r21
  param j: r24          index 0..15 of the word being expanded

  The 32-bit word stored directly below q (at q-4) is a running constant:
  it is loaded, increased by 0x05555555, written back and used as the
  start value of acc. Then, with all indices taken mod 16:
    acc += rotl(m[j],    (j    & 15) + 1)
    acc += rotl(m[j+3],  ((j+3)  & 15) + 1)
    acc -= rotl(m[j+10], ((j+10) & 15) + 1)
    acc ^= h[j+7]

  out: acc = result, X = q + 4*j, r24 = 4*j
  Clobbers r0, r20, r22..r25, j (r16), h0/h1 (r10:r11), m0/m1 (r12:r13)
  and the T flag. Relies on r1 being zero.
*/

expand_intro:
	push_range 26, 27
	push r24
addelement:
	mov j, r24
	movw h0, r20
	movw m0, r22
	sbiw r26, 4           ; X = q - 4, the running constant
	rcall load_acc_from_X
	ldi r24, 0x55         ; acc += 0x05555555
	add acc0, r24
	adc acc1, r24
	adc acc2, r24
	ldi r24, 5
	adc acc3, r24
	rcall store_acc_to_dec_X  ; write the updated constant back to q - 4
	adiw r26, 4
	clt                   ; T clear: the next two terms are added
	rcall load_rotate_add_M
	subi j, -3            ; j + 3
	rcall load_rotate_add_M
	set                   ; T set: the third term is subtracted
	subi j, -7            ; j + 10
	rcall load_rotate_add_M
	lsl j                 ; turn j + 10 into a byte offset
	lsl j
	subi j, -7*4+10*4     ; subtracts 12, giving 4*(j+7)
	andi j, 0x3f          ; wrap to the 64 bytes of h
	movw r26, h0
	add r26, j
	adc r27, r1
	rcall load32_from_X
	rcall eor32_to_acc
;--
	pop r24
	pop_range 26, 27
	lsl r24               ; X = q + 4*j
	lsl r24
	add r26, r24
	adc r27, r1
	ret
/*
  expand1 - q[j+16] = addelement(j) + sum over k = 0..15 of
            s_((k+1) & 3)(q[j+k])

  Parameters as for expand_intro (q in X, m in r22:r23, h in r20:r21,
  j in r24). The result is stored by the shared tail expand2_exit.
  Clobbers r19 in addition to everything expand_intro and sn clobber.
*/
expand1:
	rcall expand_intro
	ldi r19, 1
10:
	mov r20, r19
	andi r20, 3           ; s index cycles 1, 2, 3, 0
	rcall load_sn_add
	inc r19
	cpi r19, 17           ; 16 terms
	brne 10b
	rjmp expand2_exit


/******************************************************************************/
/*
  param q: r26:r27
  param m: r22:r23
  param h: r20:r21
  param j: r24
*/


/*
  expand2 - q[j+16] = addelement(j)
            + sum of q[j..j+13], where every second word (the 2nd, 4th,
              ...) is first rotated left by the next entry of
              expand2_rot_table
            + s4(q[j+14]) + s5(q[j+15])

  Parameters as for expand_intro (q in X, m in r22:r23, h in r20:r21,
  j in r24). Clobbers r19 and Z in addition to everything expand_intro
  and sn clobber.

  expand2_exit       - shared tail of expand1/expand2; X points just past
                       q[j+15] here, the result is stored at that address
  store_acc_to_dec_X - store acc to the four bytes below X using
                       pre-decrement; X ends up pointing at the stored word
*/
expand2:
	rcall expand_intro
	ldi r19, 14
	ldi r30, lo8(expand2_rot_table)
	ldi r31, hi8(expand2_rot_table)
10:
	rcall load32_from_X
	sbrs r19, 0           ; odd counter value: rotate this word
	rjmp 12f
	lpm r20, Z+
	rcall rotateleft32
12:	rcall add32_to_acc
	dec r19
	brne 10b
	ldi r20, 4
	rcall load_sn_add     ; acc += s4(q[j+14])
	ldi r20, 5
	rcall load_sn_add     ; acc += s5(q[j+15])
expand2_exit:
	adiw r26, 4           ; compensate for the pre-decrement stores below
store_acc_to_dec_X:
	st -X, acc3
	st -X, acc2
	st -X, acc1
	st -X, acc0
	ret

/******************************************************************************/
/*
  param q: r24:r25
  param m: r22:r23
  param h: r20:r21
*/
/* for calling expand1/2
  param q: r26:r27
  param m: r22:r23
  param h: r20:r21
  param j: r24
*/

/******************************************************************************/
/*
  param q: r24:r25
  param m: r22:r23
  param h: r20:r21
*/

/******************************************************************************/
/*
  param ctx:  r24:r25
  param msg:  r22:r23
*/
/* f0
  param q:  r28:r29 (Y)
  param h:  r26:r27 (X)
  param m:  r30:r31 (Z)
*/
/* f1
  param q: r24:r25
  param m: r22:r23
  param h: r20:r21
*/
/* f2
  param q: r24:r25
  param m: r22:r23
  param h: r20:r21
*/
/* Register aliases used by bmw_small_nextBlock and f0: pointers to q, h
   and m. ctx0/msg0 name the registers from which
   bmw_small_nextBlock_early takes its arguments; they overlap q0 and h0. */
q0 = 2
q1 = 3
h0 = 4
h1 = 5
m0 = 6
m1 = 7
ctx0 =   2
ctx1 =   3
msg0 =   4
msg1 =   5

/* Reload the arguments for expand1/expand2 from the copies kept by f1:
   X = r2:r3 (q), r22:r23 = r4:r5 (m), r20:r21 = r6:r7 (h). */
restore_f1:
	movw r26, r2
	movw r22, r4
    movw r20, r6
	ret
/*
  bmw_small_nextBlock(ctx, msg) - process one 64 byte message block

  param ctx: r24:r25   h[0..15] at offset 0, 32-bit block counter at
                       offset 64
  param msg: r22:r23

  bmw_small_nextBlock_early is a file-internal entry that takes ctx from
  r2:r3 (ctx0) and msg from r4:r5 (msg0).

  This part saves r2..r17 and r28:r29, reserves the 32-word q array on the
  stack, pushes a four byte seed word, increments the block counter and
  falls through into f0 with X = h, Y = q and Z = m.

  The seed pushed here is 0x4FFFFFFB (low byte at the lowest address);
  expand_intro reads the word at q-4 and adds 0x05555555 to it on every
  call. NOTE(review): this assumes stack_alloc_large leaves Y equal to the
  new stack pointer, so that the pushed bytes lie directly below q -
  confirm against avr-asm-macros.S.

  NOTE(review): bmw224_nextBlock is defined here but has no .global
  directive in this file - confirm that this is intended.
*/
bmw_small_nextBlock_early:
	movw r24, ctx0
	movw r22, msg0
.global bmw_small_nextBlock
.global bmw256_nextBlock
bmw_small_nextBlock:
bmw224_nextBlock:
bmw256_nextBlock:
	push_range  2, 7
	push_range 28, 29
	push_range  8, 17
	stack_alloc_large 32*4, r28, r29
	ldi r16, 0x4f         ; seed word, most significant byte first
	push r16
	ldi r16, 0xff
	push r16
	push r16
	ldi r16, 0xfb
	push r16
	adiw r28, 1           ; Y = q
;	push_range 28, 29 /* push Q */
;	push_range 22, 25 /* push M & H */
	/* increment counter */
	movw r26, r24
	adiw r26, 63          ; X = ctx + 64 (adiw takes at most 63)
	adiw r26,  1
	rcall load_acc_from_X
	ldi r19, 1
	add acc0, r19
	adc acc1, r1
	adc acc2, r1
	adc acc3, r1
	rcall store_acc_to_dec_X
	/* call f0 */
	movw r30, r22         ; Z = m
	movw r26, r24         ; X = h
/*
  f0 - first stage of the compression function (falls through into f1)

  in:  X (r26:r27) = h, Y (r28:r29) = q, Z (r30:r31) = m
  out: q[0..15] written; h holds its original contents again;
       r2:r3 = q, r20:r21 = h, r22:r23 = m as expected by f1

  Steps:
    1. h ^= m
    2. q[0..15] = 0, then five passes over f0_hacktable accumulate W into
       q[0..15] by adding or subtracting words of h
    3. h ^= m again, which restores h
    4. q[i] = s_(i mod 5)(q[i]) + h[(i+1) mod 16] for i = 0..15

  h0/q0/m0 are r4:r5, r2:r3 and r6:r7 in this part of the file.
*/
f0:
	movw h0, r26
	movw q0, r28
	movw m0, r30
	/* xor m into h */
	rcall memxor_64

	/* set q to zero */
	ldi r22, 64
10:	st Y+, r1
	dec r22
	brne 10b
	movw r28, q0
	/* calculate W and store it in Q, one pass per f0_hacktable row */
	ldi r19, 5
30:
	ldi r18, 16           ; 16 words per pass

	/* load values from hacktable: Z = f0_hacktable + 3*(r19-1) */
	ldi r30, lo8(f0_hacktable-3)
	ldi r31, hi8(f0_hacktable-3)
	mov r16, r19
	lsl r16
	add r16, r19
	add r30, r16
	adc r31, r1
	lpm r21, Z+           ; r21:r20 = add/subtract mask
	lpm r20, Z+
	lpm r16, Z+           ; r16 = byte offset of the first h word
40:
	;call add_hx_to_w
add_hx_to_w:
	movw r26, h0
	add r26, r16
	adc r27, r1
	rcall load32_from_Y
	sbiw r28, 4           ; Y back to the word just loaded
	lsl r20               ; next mask bit into carry
	rol r21
	brcs 300f
	/* addition */
	rcall add_X_to_32
	rjmp 500f
300: /* subtract */
	rcall load_acc_from_X
	sub r22, acc0
	sbc r23, acc1
	sbc r24, acc2
	sbc r25, acc3

500:
	rcall store32_to_Y
	subi r16, -4          ; next h word, wrapping after 16 words
	andi r16, 0x0f<<2
	dec r18
	brne 40b
	movw r28, q0
	dec r19
	brne 30b
	/* xor m into h */
	movw r26, h0
	movw r30, m0
	rcall memxor_64
	sbiw r26, 60          ; X = h + 4, that is h[1]
;---
	clr r17               ; r17 = s index, cycles 0..4
	ldi r21, 15
	mov r8, r21           ; r8 counts 15 words; sn restores r8 itself
50:
	rcall load32_from_Y
	sbiw r28, 4
	mov r20, r17
	rcall sn
	inc r17
	cpi r17, 5
	brne 52f
	clr r17
52:
	rcall add_X_to_32
	rcall store32_to_Y

	dec r8
	brne 50b
;---
	/* last word: q[15] = s0(q[15]) + h[0] */
	rcall load32_from_Y
	clr r20
	rcall sn
	movw r26, h0
	rcall add_X_to_32
	sbiw r28, 4
	rcall store32_to_Y
	sbiw r28, 4           ; Y back to q[0]: 4 + 60 bytes in two steps
	sbiw r28, 15*4
	movw r20, h0
	movw r22, m0

	/* call f1*/
	movw r2, r28
/*
  f1 - second stage: compute q[16..31] (falls through into f2)

  in:  r2:r3 = q, r22:r23 = m, r20:r21 = h
  out: r24:r25 = q, r22:r23 = m, r20:r21 = h as expected by f2

  expand1 is used for j = 0 and 1, expand2 for j = 2..15. m and h are kept
  in r4:r5 and r6:r7 so that restore_f1 can reload the arguments before
  every call.
*/
f1:
	movw r4, r22
	movw r6, r20
	movw r26, r2
	clr r24
	rcall expand1
	rcall restore_f1
	ldi r24, 1
	rcall expand1
	ldi r17, 2
10: rcall restore_f1
	mov r24, r17
	rcall expand2
	inc r17
	sbrs r17, 4           ; leave the loop once r17 reaches 16
	rjmp 10b
	rcall restore_f1
	movw r24, r2


	/* call f2 */
;	pop_range 20, 25
;	push_range 20, 25
;	rcall printQ
;	push r20
;	push r21
/* Register aliases used by f2: the accumulator, the 32-bit values XL
   (xl0..xl3) and XH (xh0..xh3), a pointer to q[16] (q16_0:q16_1) and the
   pointer to h (h0:h1). */
acc2  =  8
acc3  =  9
acc0  = 14
acc1  = 15
xl0   =  2
xl1   =  3
xl2   =  4
xl3   =  5
xh0   =  6
xh1   =  7
xh2   = 10
xh3   = 11
q16_0 = 12
q16_1 = 13
h0   =  18
h1   =  19
/*
  f2 - final stage: fold q[0..31] and m into the new h (falls through
       into exit)

  in:  r24:r25 = q, r22:r23 = m, r20:r21 = h

  Steps:
    1. XL = q[16] ^ ... ^ q[23],  XH = XL ^ q[24] ^ ... ^ q[31]
    2. h[0..15] = m[0..15]
    3. for i = 0..7:  h[i] ^= shift(XH, a) ^ shift(q[16+i], b) with a, b
       from f2_1_shift_table; for i = 8..15: h[i] ^= XH ^ q[16+i]
    4. q[0..7] ^= q[24..31], q[8] ^= q[23], q[9..15] ^= q[16..22]
    5. for i = 0..7:  h[i] += XL ^ q[i];
       for i = 8..15: h[i] += shift(XL, c) ^ q[i] with c from
       f2_2_shift_table
    6. h[8+k]  += rotl(h[4+k],  9+k) and then
       h[12+k] += rotl(h[k],   13+k) for k = 0..3
*/
f2:
	movw r26, r24
	/* calc XL & XH */
	adiw r26, 63
	adiw r26,  1
	movw q16_0, r26       ; remember the address of q[16]
	movw h0, r20
;---
;	push h0
;	push h1
;---
	movw r28, r22         ; Y = m
	rcall load_acc_from_X ; acc = q[16]
	ldi r17, 15
10:	rcall load32_from_X
	rcall eor32_to_acc
	cpi r17, 9            ; q[23] has just been folded in: acc is XL
	brne 15f
	movw xl0, acc0
	movw xl2, acc2
15:
	dec r17
	brne 10b
	movw xh0, acc0
	movw xh2, acc2
;--- DBG
;	push_range 22, 25
;	movw r22, xl0
;	movw r24, xl2
;	rcall print32
;	movw r22, xh0
;	movw r24, xh2
;	rcall print32
;	pop_range 22, 25
;--- END DBG
	 /* copy m(Y) into h */
	movw r26, h0
	ldi r22, 64
10:
	ld r23, Y+
	st X+, r23
	dec r22
	brne 10b
;--- /* calc first half of h0..h15 */
	movw r28, q16_0
	movw r26, h0
	ldi r30, lo8(f2_1_shift_table)
	ldi r31, hi8(f2_1_shift_table)
	ldi r17, 15           ; bit 3 of r17 is set during the first 8 rounds
10:
;---
	movw r22, xh0
	movw r24, xh2
  	lpm r20, Z+
	sbrc r17, 3           ; shift only while bit 3 of r17 is set
 	rcall shiftleft32
    rcall mov32_to_acc
;---
	rcall load32_from_Y
	lpm r20, Z+
	sbrc r17, 3
	rcall shiftleft32
	rcall eor32_to_acc
;---
	rcall load32_from_X
	rcall eor32_to_acc
	rcall store_acc_to_dec_X  ; overwrite the h word just read
	adiw r26, 4
;---
	dec r17
	brpl 10b
;-----
	sbiw r28, 4*8 /* Y points to q[24] */
	movw r30, r28
	sbiw r28, 63
	sbiw r28, 33 /* Y points to q[0] */
	movw r26, r28
	ldi r20, 8*4
	/* xor q[24..31] into q[0..7] */
	rcall memxor
	/* xor q[23] into q[8] */
	sbiw r30, 9*4
	ldi r20, 4
	rcall memxor
	/* xor q[16..22] into q[9..15] */
	sbiw r30, 8*4
	ldi r20, 7*4
	rcall memxor

	movw r26, h0
	ldi r17, 15
	ldi r30, lo8(f2_2_shift_table-8)
	ldi r31, hi8(f2_2_shift_table-8)
10:	movw r22, xl0
	movw r24, xl2
	lpm r20, Z+
	sbrs r17, 3           ; shift only during the last 8 rounds
	rcall shiftleft32
	rcall mov32_to_acc
	rcall load32_from_Y
	rcall eor32_to_acc
	rcall add_acc_to_X
	dec r17
	brpl 10b
;-----
	sbiw r26, 8*4 /* X points to h8 */
	movw r28, r26
	sbiw r28, 4*4 /* Y points to h4 */
	ldi r17, 8
	ldi r18, 9            ; rotation count, 9..16
10:
	rcall load32_from_Y
	mov r20, r18
	rcall rotateleft32
	rcall mov32_to_acc
	rcall add_acc_to_X
	inc r18
	cpi r17, 5            ; after h4..h7 continue with h0..h3
	brne 20f
	sbiw r28, 8*4
20:	dec r17
	brne 10b
/*
  exit - epilogue of bmw_small_nextBlock and shared register-restore tails

  Releases the 32*4 byte q array plus the 4 seed bytes, then restores the
  registers saved on entry in reverse order. The labels pop9, pop28, pop7
  and pop5 are entry points for other routines whose saved registers
  match the remainder of this sequence (sn jumps to pop5,
  bmw_small_lastBlock to pop28, bmw256 to pop9).
  stack_free_large3 is a macro from avr-asm-macros.S (not shown here).
*/
exit:
;--- DBG
;	pop r25
;	pop r24
;	ldi r22, 'H'
;	rcall printX
;--- END DBG
	stack_free_large3 32*4+4
	pop_range 10, 17
pop9:
	pop_range 8, 9
pop28:
	pop_range 28, 29
pop7:
	pop_range 6, 7
pop5:
	pop_range 2, 5
	ret

/******************************************************************************/
/* Register aliases used by bmw_small_lastBlock: ctx pointer, pointer to
   the current message block, remaining length in bits (in r28:r29) and
   pointer to the padding buffer on the stack. */
ctx0 =  2
ctx1 =  3
blc0 =  4
blc1 =  5
len0 = 28
len1 = 29
buf0 =  6
buf1 =  7

/* Load the 32-bit block counter stored at ctx0 + 64 into r21 (least
   significant byte) .. r24. Leaves Z = ctx0 + 60. */
load32_from_Z_stub:
	movw r30, ctx0
	adiw r30, 60          ; 64 exceeds the adiw range, so use 60 + displacement
	ldd r21, Z+4
	ldd r22, Z+5
	ldd r23, Z+6
	ldd r24, Z+7
	ret

/******************************************************************************/
/*
  param ctx:  r24:r25
  param msg:  r22:r23
  param len:  r20:r21
*/

/*
  bmw_small_lastBlock(ctx, msg, length_b) - process the final message
  bits, apply the padding and run the finalisation round

  param ctx:  r24:r25
  param msg:  r22:r23
  param len:  r20:r21   length in bits

  Saves r2..r7 and r28:r29; they are restored through the shared tail
  pop28. NOTE(review): bmw224_lastBlock is defined here but has no .global
  directive in this file - confirm that this is intended.
  NOTE(review): the code below assumes stack_alloc_large leaves Z equal to
  the new stack pointer - confirm against avr-asm-macros.S.
*/
.global bmw_small_lastBlock
.global bmw256_lastBlock
bmw_small_lastBlock:
bmw224_lastBlock:
bmw256_lastBlock:
/*	while(length_b >= BMW_SMALL_BLOCKSIZE){
		bmw_small_nextBlock(ctx, block);
		length_b -= BMW_SMALL_BLOCKSIZE;
		block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
	}
*/
	push_range 2, 7
	push_range 28, 29
	movw ctx0, r24
	movw blc0, r22
	movw len0, r20
1:
	cpi len1, hi8(512)    ; 512 has a zero low byte, so the high byte decides
	brlo 2f
	rcall bmw_small_nextBlock_early
	ldi r24, 64
	add blc0, r24
	adc blc1, r1
	subi len1, hi8(512)
	rjmp 1b
2:
/*	struct {
		uint8_t  buffer[64];
		uint32_t ctr;
	} pctx;
*/
	stack_alloc_large 68
	adiw r30, 1
	movw buf0, r30
/*	memset(pctx.buffer, 0, 64);
	memcpy(pctx.buffer, block, (length_b+7)/8);
	pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
*/	movw r24, len0
	ldi r23, 63           ; r23 = number of zero bytes to append later
	movw r26, blc0
	lsr r25               ; r24 = number of complete bytes (length_b >> 3)
	ror r24
	lsr r24
	lsr r24
	breq 301f
	sub r23, r24
	/* copy (#r24) bytes to stack buffer */
30: ld r20, X+
	st Z+, r20
	dec r24
	brne 30b
301: /* calculate the appended byte */
	clr r20
	mov r21, len0
	ldi r24, 0x80
	andi r21, 0x07
	breq 305f
	ld r20, X+            ; partial last byte of the message
303:
	lsr r24               ; r24 = 0x80 >> (length_b & 7)
	dec r21
	brne 303b
305:
	or r20, r24
	st Z+, r20
	tst r23
	breq 32f
31: st Z+, r1             ; fill the rest of the buffer with zeros
	dec r23
	brne 31b
32:
/*	if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
		bmw_small_nextBlock(ctx, pctx.buffer);
		memset(pctx.buffer, 0, 64-8);
		ctx->counter -= 1;
	}
*/
	tst len1              ; length_b >= 448 means len1 != 0 and len0 >= 192
	breq 400f
	cpi len0, 192
	brlo 400f
	movw blc0, buf0
	rcall bmw_small_nextBlock_early
	movw r26, buf0
	ldi r20, 64-8
350:
	st X+, r1
	dec r20
	brne 350b
	rcall load32_from_Z_stub
	subi r21, 1           ; use counter - 1 for the length field
	sbc r22, r1
	sbc r23, r1
	sbc r24, r1
	rjmp 410f
/*	*((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
	bmw_small_nextBlock(ctx, pctx.buffer);
*/
400:
	rcall load32_from_Z_stub
410:
	clr r25
	ldi r20, 1            ; one pass through the rol32 chain
	lsl r21               ; counter*2 in r21..r25, one byte above r20,
	rcall rol32           ; which makes r20..r25 = counter * 512
	mov r20, len0
	add r21, len1         ; + length_b (the low byte of counter*512 is zero)
	adc r22, r1
	adc r23, r1
	adc r24, r1
	adc r25, r1
	movw r26, buf0
	adiw r26, 64-8
	st X+, r20
	st X+, r21
	rcall store32_to_X
	st X+, r1
	st X+, r1
	movw blc0, buf0
	rcall bmw_small_nextBlock_early
/*	memset(pctx.buffer, 0xaa, 64);
	for(i=0; i<16;++i){
		pctx.buffer[i*4] = i+0xa0;
	}
*/
	ldi r22, 0xa0
	ldi r23, 0xaa
	ldi r24, 0xaa
	ldi r25, 0xaa
	movw r26, buf0
500:
	rcall store32_to_X
	inc r22
	sbrs r22, 4           ; stop when r22 reaches 0xb0, after 16 words
	rjmp 500b
/*	bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
	memcpy(ctx->h, pctx.buffer, 64);
*/
    movw r24, buf0
    movw r22, ctx0
    rcall bmw_small_nextBlock
	ldi r18, 64
	movw r26, ctx0
	movw r30, buf0
600:
	ld r20, Z+
	st X+, r20
	dec r18
	brne 600b

	stack_free_large 68
	rjmp pop28


/*******************************************************************************
* void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
*	memcpy(dest, &(ctx->h[8]), 256/8);
* }
*
* param dest:  r24:r25
* param ctx:   r22:r23
*/
/*
  bmw256_ctx2hash(dest, ctx) - copy the 32 bytes starting at ctx->h[8]
  to dest

  param dest: r24:r25
  param ctx:  r22:r23
  Clobbers r18, r23, X and Z.
*/
.global bmw256_ctx2hash
bmw256_ctx2hash:
	movw r30, r22
	adiw r30, 8*4         ; Z = &ctx->h[8]
	movw r26, r24         ; X = dest
	ldi r18, 32
1:	ld r23, Z+
	st X+, r23
	dec r18
	brne 1b
	ret

/*******************************************************************************
* void bmw256(void* dest, const void* msg, uint32_t length_b){
*	bmw_small_ctx_t ctx;
*	bmw256_init(&ctx);
*	while(length_b>=BMW_SMALL_BLOCKSIZE){
*		bmw_small_nextBlock(&ctx, msg);
*		length_b -= BMW_SMALL_BLOCKSIZE;
*		msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
*	}
*	bmw_small_lastBlock(&ctx, msg, length_b);
*	bmw256_ctx2hash(dest, &ctx);
* }
*
* param dest:     r24:r25
* param msg:      r22:r23
* param length_b: r18:r21
*/
/* Register aliases. Every one of these symbols is assigned again further
   down before any instruction uses it, so the values here have no effect
   on the generated code. */
ctx0 =   2
ctx1 =   3
msg0 =   4
msg1 =   5
len0 =   6
len1 =   7
len2 =   8
len3 =   9
dst0 =  10
dst1 =  11


/*******************************************************************************
* void bmw224(void* dest, const void* msg, uint32_t length_b){
*	bmw_small_ctx_t ctx;
*	bmw224_init(&ctx);
*	while(length_b>=BMW_SMALL_BLOCKSIZE){
*		bmw_small_nextBlock(&ctx, msg);
*		length_b -= BMW_SMALL_BLOCKSIZE;
*		msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
*	}
*	bmw_small_lastBlock(&ctx, msg, length_b);
*	bmw224_ctx2hash(dest, &ctx);
* }
*
* param dest:     r24:r25
* param msg:      r22:r23
* param length_b: r18:r21
*/
/* Register aliases used by bmw256: pointer to the context on the stack,
   message pointer, 32-bit length in bits (len0 = least significant byte,
   low half in r28:r29) and destination pointer. */
ctx0 =   2
ctx1 =   3
msg0 =   4
msg1 =   5
len0 =  28
len1 =  29
len2 =   8
len3 =   9
dst0 =   6
dst1 =   7


/*
  bmw256(dest, msg, length_b) - hash a complete message

  param dest:     r24:r25
  param msg:      r22:r23
  param length_b: r18:r21   length in bits

  A 64+4 byte context is allocated on the stack and initialised. Whole
  blocks are processed here only while the upper 16 bits of the length are
  non-zero; the remaining length fits into 16 bits and is handed to
  bmw_small_lastBlock, which processes any further whole blocks itself.
  Saves r2..r9 and r28:r29; they are restored through the shared tail pop9.
  NOTE(review): assumes stack_alloc_large leaves Z equal to the new stack
  pointer - confirm against avr-asm-macros.S.
*/
.global bmw256
bmw256:
	push_range 2, 7
	push_range 28, 29
	push_range 8, 9
	stack_alloc_large 64+4
	adiw r30, 1
10:	movw ctx0, r30
	movw dst0, r24
	movw msg0, r22
	movw len0, r18
	movw len2, r20
	movw r24, ctx0
	rcall bmw256_init
20:
	mov r18, len2
	or  r18, len3
	breq 50f              ; length below 65536 bits: leave the rest to lastBlock
	rcall bmw_small_nextBlock_early
	subi len1, 2          ; length -= 512
	sbc len2, r1
	sbc len3, r1
	ldi r20, 64
	add msg0, r20
	adc msg1, r1
	rjmp 20b
50:
	movw r24, ctx0
	movw r22, msg0
	movw r20, len0
	rcall bmw_small_lastBlock
	movw r24, dst0
	movw r22, ctx0
	rcall bmw256_ctx2hash
	stack_free_large 64+4
	rjmp pop9

/******************************************************************************/
/*
  bmw256_init(ctx) - set up the initial state

  param ctx: r24:r25

  The 64 bytes of h are filled with the byte values 0x40..0x7f, four per
  word and written from the highest address of each word downwards, so
  h[0] reads 0x40414243 as a little-endian word. The four counter bytes
  at offset 64 are then cleared.
  Clobbers r20, r22, r23 and X. Relies on r1 being zero.
*/
.global bmw256_init
bmw256_init:
	ldi r22, 0x40         ; next byte value
	ldi r23, 0x80         ; end marker: one past the last byte value
	movw r26, r24
	adiw r26, 4           ; X = end of h[0]
1:
	st -X, r22
	inc r22
	mov r20, r22
	andi r20, 0x3         ; four bytes per word
	brne 1b
	adiw r26, 8           ; X = end of the next word
	cp r22, r23
	brne 1b
	/* X is now 4 bytes behind the end of h: clear the counter */
	.rept 4
	st -X, r1
	.endr
	ret


/******************************************************************************/

#if DEBUG

/*
  printQ (debug only) - dump 32 consecutive 32-bit words

  param: r24:r25 = address of the first word

  Only r20..r25 are saved; r8, r9, r16 and r17 are overwritten.
  The cli_* routines are external; presumably cli_putstr_P prints a string
  from program memory, cli_hexdump_byte prints r24 as hex and
  cli_hexdump_rev dumps r22:r23 bytes at r24:r25 - confirm against the
  cli library.
*/
printQ:
	push_range 20, 25
	ldi r16, 4
	mov r9, r16           ; r9 = size of one word
	movw r16, r24         ; r16:r17 = address of the current word
	ldi r24, lo8(qdbg_str)
	ldi r25, hi8(qdbg_str)
	call cli_putstr_P
	clr r8                ; r8 = word index
10:	ldi r24, lo8(qdbg_str1)
	ldi r25, hi8(qdbg_str1)
	call cli_putstr_P
	mov r24, r8
	call cli_hexdump_byte
	ldi r24, lo8(qdbg_str2)
	ldi r25, hi8(qdbg_str2)
	call cli_putstr_P
	movw r24, r16
	clr r23
	ldi r22, 4
	call cli_hexdump_rev
	add r16, r9
	adc r17, r1
	inc r8
	sbrs r8, 5            ; stop after index 31
	rjmp 10b
	pop_range 20, 25
	ret
qdbg_str:  .asciz "\r\nDBG Q: "
qdbg_str1: .asciz "\r\n Q["
qdbg_str2: .asciz "] =  "


/*
  printX (debug only) - dump 16 consecutive 32-bit words with a one
  character name

  param: r24:r25 = address of the first word
  param: r22     = character printed as the name of the array

  Saves r6..r9, r16..r27 and r30:r31.
  The cli_* routines are external; presumably cli_putc prints the
  character in r24 - confirm against the cli library.
*/
printX:
	push_range 6, 9
	push_range 16, 27
	push_range 30, 31
	ldi r16, 4
	mov r6, r22           ; r6 = name character
	mov r9, r16           ; r9 = size of one word
	movw r16, r24         ; r16:r17 = address of the current word
	ldi r24, lo8(Xdbg_str)
	ldi r25, hi8(Xdbg_str)
	call cli_putstr_P
	mov r24, r6
	call cli_putc
	ldi r24, ':'
	call cli_putc
	clr r8                ; r8 = word index
10:	ldi r24, lo8(Xdbg_str1)
	ldi r25, hi8(Xdbg_str1)
	call cli_putstr_P
	mov r24, r6
	call cli_putc
	ldi r24, '['
	call cli_putc
	mov r24, r8
	call cli_hexdump_byte
	ldi r24, lo8(Xdbg_str2)
	ldi r25, hi8(Xdbg_str2)
	call cli_putstr_P
	movw r24, r16
	clr r23
	ldi r22, 4
	call cli_hexdump_rev
	add r16, r9
	adc r17, r1
	inc r8
	sbrs r8, 4            ; stop after index 15
	rjmp 10b
	pop_range 30, 31
	pop_range 16, 27
	pop_range 6, 9
	ret
Xdbg_str:  .asciz "\r\nDBG "
Xdbg_str1: .asciz "\r\n "
Xdbg_str2: .asciz "] = "

/*
  print32 (debug only) - print the 32-bit value in r22:r23:r24:r25 as four
  hex bytes, r25 first

  Saves r6..r9, r16..r27 and r30:r31. Uses Xdbg_str, which is defined
  together with printX.
*/
print32:
	push_range 6, 9
	push_range 16, 27
	push_range 30, 31
	movw r6, r22          ; keep the value in r6..r9 across the calls
	movw r8, r24
	ldi r24, lo8(Xdbg_str)
	ldi r25, hi8(Xdbg_str)
	call cli_putstr_P
	mov r24, r9
	call cli_hexdump_byte
	mov r24, r8
	call cli_hexdump_byte
	mov r24, r7
	call cli_hexdump_byte
	mov r24, r6
	call cli_hexdump_byte
	pop_range 30, 31
	pop_range 16, 27
	pop_range 6, 9
	ret


/*
  print_acc (debug only) - print the accumulator (r9, r8, r15, r14, that is
  acc3 down to acc0) as four hex bytes

  Saves r16..r27 and r30:r31. NOTE(review): r8, r9, r14 and r15 are read
  after calls to external cli_* routines; this presumably relies on those
  routines preserving them - confirm.
*/
print_acc:
	push_range 16, 27
	push_range 30, 31
	ldi r24, lo8(Xdbg_str)
	ldi r25, hi8(Xdbg_str)
	call cli_putstr_P
	mov r24, r9
	call cli_hexdump_byte
	mov r24, r8
	call cli_hexdump_byte
	mov r24, r15
	call cli_hexdump_byte
	mov r24, r14
	call cli_hexdump_byte
	pop_range 30, 31
	pop_range 16, 27
	ret

#endif

